# R Options
options(stringsAsFactors=FALSE)
# Required libraries
library(dplyr)
library(tidyr)
library(ggplot2)
library(patchwork)
library(openxlsx)
library(ggpubr)
library(ggsci)
library(knitr)
library(kableExtra)
library(Seurat)
# Source plotting functions
source("functions_plotting.R")
source("functions_analysis.R")Open this code chunk to read all parameters that are set specifically for your project.
param = list()
# Project ID
param$project = "HTO_testDataset"
# Project-specific paths
param$path.out = "~/scrnaseq_Seurat_10x_pbmc_hto_GSE108313/"
#param$path.out = "~/scrnaseq_bfx1256/Seurat/"
if(!file.exists(param$path.out)) dir.create(param$path.out)
# Input data path in case Cell Ranger was run
param$path.data = "~/scrnaseq/testDatasets/10x_pbmc_hto_GSE108313/cellranger/"
#param$path.data = "~/scrnaseq_bfx1256/filtered_feature_bc_matrix/"
# This could look like this, where HTO1-3 are the IDs included raw dataset
# param$hto.names = setNames(c("NameA", "NameB", "NameC"), c("HTO1", "HTO2", "HTO3"))
param$hto.names = NULL
#param$hto.names = setNames(c("Parental", "BoneSeeking", "Reference"), c("HumanHashTag4", "HumanHashTag5", "HumanHashTag6"))
# Prefix of mitochondrial genes
param$mt = "^MT-"
# Main color to use for plots
param$col = "palevioletred"In this first section of the report, we read 10X RNA and HTO input data from the files produced by CellRanger:
* barcodes.tsv.gz: All cell barcodes
* features.tsv.gz: (Ensembl) ID, name, and type for each gene and HTO
* matrix.mtx.gz: Raw RNA and HTO counts
and setup a Seurat object.
# Load the dataset
sc.data = Read10X(param$path.data)
# If HTO names are not provided, set them to the HTO rownames of the input file
# HTO rownames should not contain special characters like '-' or '_', this will not work
# '_' will be translated to '-' by Seurat
# ggplot however won't be able to work with '-' in HTO names
if(is.null(param$hto.names)) {
param$hto.names = setNames(rownames(sc.data[["Antibody Capture"]]), rownames(sc.data[["Antibody Capture"]]))
}
# In the case of Hashtag Oligos, we keep only cells represented in both "Gene Expression" and "Antibody Capture" tables
joint.barcodes <- intersect(colnames(sc.data[["Gene Expression"]]), colnames(sc.data[["Antibody Capture"]]))
# Subset counts by joint cell barcodes
sc.data[["Gene Expression"]] = sc.data[["Gene Expression"]][,joint.barcodes]
sc.data[["Antibody Capture"]] = sc.data[["Antibody Capture"]][,joint.barcodes]
# Replace HTO rownames of the input file with provided HTO (nice) names
rownames(sc.data[["Antibody Capture"]]) = param$hto.names[rownames(sc.data[["Antibody Capture"]])]
# Initialize the Seurat object with the raw non-normalised data
sc = CreateSeuratObject(counts=sc.data[["Gene Expression"]], project=param$project, min.cells=3)
sc$HTO = CreateAssayObject(counts=sc.data[["Antibody Capture"]])
# Colors for HTO
n.hto = nrow(sc@assays$HTO)
param$col.hto.global = pal_npg()(3)
param$col.hto.all = pal_npg()(factorial(n.hto) + 1)
param$col.hto.collapsed = pal_npg()(n.hto + 2)
sc## An object of class Seurat
## 17607 features across 16916 samples within 2 assays
## Active assay: RNA (17599 features)
## 1 other assay present: HTO
## An object of class Seurat
## 17607 features across 3000 samples within 2 assays
## Active assay: RNA (17599 features)
## 1 other assay present: HTO
This section of the report shows how cells are assigned to their sample-of-origin.
We start the analysis by normalising raw HTO counts. HTO counts for each cell are divided by the total counts for that cell and multiplied by 10,000. This is then natural-log transformed.
We assign cells to sample-of-origin, annotate negative cells that cannot be assigned to any sample, and doublet cells that are assigned to two samples.
## Cutoff for htoA : 222 reads
## Cutoff for htoB : 53 reads
## Cutoff for htoC : 99 reads
## Cutoff for htoD : 286 reads
## Cutoff for htoE : 231 reads
## Cutoff for htoF : 221 reads
## Cutoff for htoG : 437 reads
## Cutoff for htoH : 127 reads
# Sort Idents levels for nicer plotting
levels = c("Negative", "Doublet", param$hto.names[length(param$hto.names):1])
Idents(sc) = factor(Idents(sc), levels=levels)
sc@meta.data$hash.ID = factor(sc@meta.data$hash.ID, levels=levels)
names(param$col.hto.collapsed) = levels
# HTO classification results
hash.ID.table = sc@meta.data %>% dplyr::count(hash.ID) %>% dplyr::rename(HTO=hash.ID) %>% mutate(Perc=round(n/sum(n)*100,2)) %>% as.data.frame
p1 = ggplot(sc@meta.data %>% dplyr::count(HTO_classification.global), aes(x="", y=n, fill=HTO_classification.global)) +
geom_bar(width=1, stat="identity") + coord_polar("y", start=0) +
xlab("") + ylab("")
p1 = plot.mystyle(p=p1, title="HTO global classification results", col=param$col.hto.global)
p2 = ggplot(sc@meta.data %>% dplyr::count(hash.ID), aes(x="", y=n, fill=hash.ID)) +
geom_bar(width=1, stat="identity") + coord_polar("y", start=0) +
xlab("") + ylab("")
p2 = plot.mystyle(p=p2, title="HTO classification results", col=param$col.hto.collapsed)
p1 + p2 + plot_annotation(title="HTO classification results") + gridExtra::tableGrob(hash.ID.table, rows=NULL)This section of the report visualises raw and normalised HTO data to understand whether the demultiplexing step has worked well.
# Distribution of HTO counts before and after normalisation
hto.t.raw = sc@assays$HTO@counts %>% as.data.frame %>% t %>% as.data.frame
hto.t.raw.pseudo = hto.t.raw + 1
hto.t.norm = sc@assays$HTO@data %>% as.data.frame %>% t %>% as.data.frame
p1 = ggplot(hto.t.raw.pseudo %>% pivot_longer(everything()), aes(x=name, y=value, fill=name)) + geom_violin() + scale_y_continuous(trans="log2") + xlab("") + ylab("")
p1 = plot.mystyle(p1, title="HTO raw counts", col=param$col.hto.collapsed, legend.title="HTO")
p2 = ggplot(hto.t.norm %>% pivot_longer(everything()), aes(x=name, y=value, fill=name)) + geom_violin() + xlab("") + ylab("")
p2 = plot.mystyle(p2, title="HTO normalised counts", col=param$col.hto.collapsed, legend.title="HTO")
p = p1 + p2 & theme(legend.position="bottom")
p + plot_annotation("HTO counts before and after normalisation") + plot_layout(guides = "collect")Pairs of raw (top) and normalised (bottom) HTO counts are visualised to confirm mutal exclusivity in singlet cells. Data points correspond to measured HTO counts per HTO, colours correspond to the assigned samples-of-origin.
n = df.all.col.combinations(x=hto.t.raw.pseudo, cell.classification=sc$hash.ID)
p = ggplot(n, aes(x=value1, y=value2, color=cell.classification)) + geom_point() +
scale_x_continuous(trans="log2") + scale_y_continuous(trans="log2") +
scale_color_manual(values=param$col.hto.collapsed)
p = plot.mystyle(p)
p = p + facet_grid(name2~name1, drop=FALSE) +
theme(strip.text.x = element_text(size=10, color="black"),
strip.text.y = element_text(size=10, color="black"),
strip.background = element_rect(colour="white", fill="lightgrey"),
legend.position="bottom") +
plot_annotation("Raw HTO counts")
pn = df.all.col.combinations(x=hto.t.norm, cell.classification=sc$hash.ID)
p = ggplot(n, aes(x=value1, y=value2, color=cell.classification)) + geom_point() +
scale_x_continuous(trans="log2") + scale_y_continuous(trans="log2") +
scale_color_manual(values=param$col.hto.collapsed)
p = plot.mystyle(p)
p = p + facet_grid(name2~name1, drop=FALSE) +
theme(strip.text.x = element_text(size=10, color="black"),
strip.text.y = element_text(size=10, color="black"),
strip.background = element_rect(colour="white", fill="lightgrey"),
legend.position="bottom") +
plot_annotation("Normalised HTO data")
pThe following ridge plots visualise the enrichment of assigned sample-of-origin for the respective normalised HTO counts.
# Group cells based on HTO classification
p = RidgePlot(sc, assay="HTO", features=rownames(sc@assays$HTO), ncol = 1, same.y.lims=TRUE, cols=param$col.hto.collapsed, combine=FALSE)
for(i in 1:length(p)) p[[i]] = plot.mystyle(p[[i]], legend.title="Classified cells")
p = wrap_plots(p, ncol = 2) + plot_annotation("Normalised HTO data")+ plot_layout(guides = "collect") & theme(legend.position="bottom")
pLastly, we compare the number of genes between classified cells.
# Number of features in the different cells
p = VlnPlot(sc, features="nFeature_RNA", idents=levels(Idents(sc)), ncol=3, pt.size=0) + xlab("") + geom_violin(color=NA)
p = plot.mystyle(p, title="nFeature_RNA of HTO-classified cells", col=param$col.hto.collapsed, legend.title="Classified cells")
pThis section of the report states the number of cells that remain after negative and doublet cells are removed.
## An object of class Seurat
## 17607 features across 2422 samples within 2 assays
## Active assay: RNA (17599 features)
## 1 other assay present: HTO
This section of the report provides first insights into your RNA dataset based on a preliminary pre-processing of the RNA data using the usual scRNA-seq workflow.
sc.all = prelim.analysis(sc=sc.all, mt="^MT-", pc.n=10)
sc = prelim.analysis(sc=sc, mt="^MT-", pc.n=10)We use a UMAP to visualise and explore a dataset. The goal is to place similar cells together in 2D space, and learn about the biology underlying the data. Cells are color-coded according to the assigned sample-of-origin.
Take care not to mis-read a UMAP:
For a nice read to intuitively understand UMAP, see https://pair-code.github.io/understanding-umap/.
p = DimPlot(sc.all, reduction="umap", group.by="HTO_classification", cols=param$col.hto.all)
p = plot.mystyle(p, title="UMAP, cells coloured by HTO classification, including doublets and negatives", legend.title="Cell classification", legend.position="bottom")
p# Plot UMAP after HTO filtering
p = DimPlot(sc, reduction="umap", group.by="HTO_classification", cols=param$col.hto.collapsed)
p = plot.mystyle(p, title="UMAP, cells coloured by HTO classification, singlets only", legend.title="Cell classification", legend.position="bottom")
pFinally, demultiplexed RNA data are written back to file.